Library Imports

library(tidyverse)      # Data manipulation and visualization
## Warning: package 'tidyverse' was built under R version 4.3.2
## Warning: package 'ggplot2' was built under R version 4.3.2
## Warning: package 'tidyr' was built under R version 4.3.2
## Warning: package 'dplyr' was built under R version 4.3.2
## Warning: package 'stringr' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)     # Efficient data manipulation
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(skimr)          # Data summary statistics
library(rstudioapi)     # RStudio API for interactions with RStudio
library(inspectdf)      # DataFrame inspection
library(mice)           # Imputation of missing values
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(plotly)         # Interactive plots
## Warning: package 'plotly' was built under R version 4.3.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(highcharter)    # Highcharts for R
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo 
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
library(recipes)        # Preprocessing of data
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(caret)          # Classification and regression training
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(purrr)          # Functional programming
library(graphics)       # Base R graphics
library(Hmisc)          # Miscellaneous functions
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:plotly':
## 
##     subplot
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(glue)           # String manipulation

library(h2o)            # H2O.ai for machine learning
## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------
## 
## 
## Attaching package: 'h2o'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, month, week, year
## 
## The following objects are masked from 'package:lubridate':
## 
##     day, hour, month, week, year
## 
## The following objects are masked from 'package:stats':
## 
##     cor, sd, var
## 
## The following objects are masked from 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

Data Loading and Exploration

# Read the raw data from disk (data.table::fread for speed)
raw <- fread("crimes.csv")

# Quick structural overview: column names, types, and sample values
glimpse(raw)
## Rows: 1,994
## Columns: 20
## $ PctEmplProfServ     <dbl> 0.41, 0.15, 0.29, 0.45, 0.38, 0.77, 0.53, 0.34, 0.…
## $ PctOccupManu        <dbl> 0.25, 0.42, 0.49, 0.37, 0.42, 0.06, 0.33, 0.71, 0.…
## $ PctOccupMgmtProf    <dbl> 0.52, 0.36, 0.32, 0.39, 0.46, 0.91, 0.49, 0.18, 0.…
## $ MalePctDivorce      <dbl> 0.68, 1.00, 0.63, 0.34, 0.22, 0.49, 0.25, 0.38, 0.…
## $ MalePctNevMarr      <dbl> 0.40, 0.63, 0.41, 0.45, 0.27, 0.57, 0.34, 0.47, 0.…
## $ FemalePctDiv        <dbl> 0.75, 0.91, 0.71, 0.49, 0.20, 0.61, 0.28, 0.59, 0.…
## $ TotalPctDiv         <dbl> 0.75, 1.00, 0.70, 0.44, 0.21, 0.58, 0.28, 0.52, 0.…
## $ PersPerFam          <dbl> 0.35, 0.29, 0.45, 0.75, 0.51, 0.44, 0.42, 0.78, 0.…
## $ PctFam2Par          <dbl> 0.55, 0.43, 0.42, 0.65, 0.91, 0.62, 0.77, 0.45, 0.…
## $ PctKids2Par         <dbl> 0.59, 0.47, 0.44, 0.54, 0.91, 0.69, 0.81, 0.43, 0.…
## $ PctYoungKids2Par    <dbl> 0.61, 0.60, 0.43, 0.83, 0.89, 0.87, 0.79, 0.34, 0.…
## $ PctTeen2Par         <dbl> 0.56, 0.39, 0.43, 0.65, 0.85, 0.53, 0.74, 0.34, 0.…
## $ PctWorkMomYoungKids <dbl> 0.74, 0.46, 0.71, 0.85, 0.40, 0.30, 0.57, 0.29, 0.…
## $ PctWorkMom          <dbl> 0.76, 0.53, 0.67, 0.86, 0.60, 0.43, 0.62, 0.27, 0.…
## $ NumIlleg            <dbl> 0.04, 0.00, 0.01, 0.03, 0.00, 0.00, 0.00, 0.02, 0.…
## $ PctIlleg            <dbl> 0.14, 0.24, 0.46, 0.33, 0.06, 0.11, 0.13, 0.50, 0.…
## $ NumImmig            <dbl> 0.03, 0.01, 0.00, 0.02, 0.00, 0.04, 0.01, 0.02, 0.…
## $ PctImmigRecent      <dbl> 0.24, 0.52, 0.07, 0.11, 0.03, 0.30, 0.00, 0.50, 0.…
## $ PctImmigRec5        <dbl> 0.27, 0.62, 0.06, 0.20, 0.07, 0.35, 0.02, 0.59, 0.…
## $ ViolentCrimesPerPop <dbl> 0.20, 0.67, 0.43, 0.12, 0.03, 0.14, 0.03, 0.55, 0.…
# Count and report missing values per column (none expected here)
inspect_na(raw)
## # A tibble: 20 × 3
##    col_name              cnt  pcnt
##    <chr>               <int> <dbl>
##  1 PctEmplProfServ         0     0
##  2 PctOccupManu            0     0
##  3 PctOccupMgmtProf        0     0
##  4 MalePctDivorce          0     0
##  5 MalePctNevMarr          0     0
##  6 FemalePctDiv            0     0
##  7 TotalPctDiv             0     0
##  8 PersPerFam              0     0
##  9 PctFam2Par              0     0
## 10 PctKids2Par             0     0
## 11 PctYoungKids2Par        0     0
## 12 PctTeen2Par             0     0
## 13 PctWorkMomYoungKids     0     0
## 14 PctWorkMom              0     0
## 15 NumIlleg                0     0
## 16 PctIlleg                0     0
## 17 NumImmig                0     0
## 18 PctImmigRecent          0     0
## 19 PctImmigRec5            0     0
## 20 ViolentCrimesPerPop     0     0
# Open the dataset in the RStudio data viewer for manual inspection
View(raw)

# Extract numeric variable names.
# select(where(...)) replaces the superseded select_if().
num_vars <- raw %>% select(where(is.numeric)) %>% names()

# Report boxplot (Tukey fence) outliers for each numeric variable.
# plot = FALSE computes the statistics without drawing a chart for
# every one of the 20 columns.
for (b in num_vars) {
  OutVals <- boxplot(raw[[b]], plot = FALSE)$out
  if (length(OutVals) > 0) {
    print(paste0("----", b))
    print(OutVals)
  }
}

## [1] "----PctEmplProfServ"
##  [1] 1.00 1.00 0.85 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.93 0.87 1.00 1.00 0.85
## [16] 0.89 1.00 0.94 1.00 0.89 1.00 1.00 1.00 0.88 0.90 1.00 1.00 1.00 0.88 1.00
## [31] 0.97 1.00 0.87 0.85 1.00 0.93 1.00 1.00 0.95 0.97 1.00 1.00 0.86 0.98 1.00
## [46] 0.90 0.94 0.93 1.00 0.00 0.99 0.88 0.97 1.00 0.96 1.00 0.93 0.86 0.96 1.00
## [61] 1.00 0.89 1.00 0.90 0.90 0.90 1.00 1.00

## [1] "----PctOccupManu"
##  [1] 1.00 0.99 0.99 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00
## [16] 1.00 0.97 0.92 1.00 1.00 0.99 1.00 1.00 1.00 0.96 1.00 0.93

## [1] "----PctOccupMgmtProf"
##  [1] 0.91 0.96 0.98 0.99 1.00 0.93 0.90 0.96 0.92 0.95 1.00 1.00 1.00 0.96 1.00
## [16] 1.00 0.97 1.00 0.94 0.90 0.91 0.99 1.00 1.00 0.93 0.91 0.90 0.89 0.92 0.91
## [31] 0.93 0.96 1.00 0.95 0.98 1.00 1.00 1.00 0.92 0.90 0.98 1.00 0.93 0.91 1.00
## [46] 0.96 0.94 1.00 1.00 1.00 1.00 0.99 1.00 0.91 1.00 0.91 1.00 0.98 0.90 1.00
## [61] 1.00

## [1] "----MalePctDivorce"
## [1] 1.00 1.00 1.00 0.98 1.00

## [1] "----MalePctNevMarr"
##   [1] 0.85 0.94 1.00 1.00 1.00 1.00 0.93 1.00 1.00 1.00 0.90 0.98 1.00 0.81 0.88
##  [16] 1.00 1.00 0.86 0.86 0.96 1.00 0.89 0.80 0.85 0.95 1.00 0.84 1.00 0.80 1.00
##  [31] 0.83 0.96 1.00 1.00 1.00 1.00 1.00 0.79 1.00 0.83 0.96 1.00 0.85 1.00 1.00
##  [46] 1.00 0.97 1.00 0.88 1.00 0.81 0.79 0.84 1.00 0.84 0.92 1.00 0.91 0.83 0.02
##  [61] 1.00 1.00 0.80 0.81 0.97 1.00 0.87 0.79 0.79 0.91 0.96 0.97 0.91 0.93 0.92
##  [76] 1.00 1.00 0.79 0.94 1.00 0.99 0.95 0.83 0.90 0.84 1.00 1.00 0.86 1.00 0.00
##  [91] 1.00 0.84 0.80 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.84 1.00 0.81 0.82
## [106] 0.85 0.87

## [1] "----PersPerFam"
##   [1] 0.85 1.00 0.14 0.83 0.90 0.82 0.97 1.00 0.97 1.00 1.00 0.85 0.97 0.93 1.00
##  [16] 1.00 0.83 0.84 0.85 1.00 0.01 0.90 0.88 0.92 1.00 1.00 1.00 0.92 1.00 1.00
##  [31] 0.95 0.88 0.85 1.00 1.00 1.00 1.00 1.00 0.84 1.00 1.00 0.90 0.82 0.10 0.90
##  [46] 0.06 0.11 0.87 0.81 0.86 1.00 0.14 0.98 0.08 0.90 1.00 0.96 0.84 0.06 1.00
##  [61] 0.00 0.81 1.00 1.00 0.12 0.86 0.13 1.00 0.14 0.90 1.00 0.85 1.00 0.96 0.85
##  [76] 1.00 0.90 0.84 0.82 0.87 1.00 0.81 0.09 1.00 0.15 0.88 1.00 1.00 1.00 0.14
##  [91] 0.81 0.96 0.84 1.00 1.00 0.85 0.84 0.00 1.00 0.89 0.90 0.83 1.00 0.12 1.00
## [106] 1.00 0.95 0.85 0.84 0.00 1.00 0.00 1.00 0.10 1.00 0.91 0.85 0.88 0.06 0.88
## [121] 0.86

## [1] "----PctFam2Par"
##  [1] 0.05 0.04 0.00 0.06 0.01 0.08 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.02 0.03
## [16] 0.00 0.01 0.08 0.00 0.00 0.00 0.06 0.00 0.06 0.07 0.08 0.05 0.00 0.06

## [1] "----PctKids2Par"
##  [1] 0.05 0.05 0.00 0.02 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.03 0.00
## [16] 0.00 0.00 0.00 0.00 0.05 0.05 0.03 0.00 0.03

## [1] "----PctYoungKids2Par"
##  [1] 0.00 0.06 0.03 0.00 0.00 0.00 0.00 0.00 0.06 0.00 0.02 0.00 0.02 0.00 0.00
## [16] 0.00 0.02 0.05 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.05 0.06

## [1] "----PctTeen2Par"
##  [1] 0.10 0.05 0.00 0.09 0.09 0.00 0.00 0.03 0.11 0.00 0.10 0.00 0.04 0.10 0.10
## [16] 0.00 0.00 0.00 0.06 0.00 0.06 0.05 0.02 0.00 0.00 0.00 0.02 0.10 0.00 0.07
## [31] 0.00 0.05 0.10 0.04 0.00 0.08 0.02 0.00 0.05 0.00 0.03 0.03 0.06 0.00 0.10
## [46] 0.05 0.05 0.00 0.00 0.00

## [1] "----PctWorkMomYoungKids"
##  [1] 0.00 0.00 0.03 0.03 0.97 1.00 1.00 0.98 0.02 0.01 0.00 0.00 0.02 0.00

## [1] "----PctWorkMom"
##  [1] 0.06 1.00 0.06 0.00 0.00 0.00 0.00 0.05 0.05 0.03 0.03 0.07 0.00 0.00 0.00
## [16] 0.06 0.05 1.00 0.03 0.00 0.00 0.00 0.01 0.00 0.00 0.02 0.00 0.00 1.00

## [1] "----NumIlleg"
##   [1] 0.13 0.08 0.49 0.12 0.07 0.09 0.14 0.09 0.06 0.06 0.07 0.10 0.19 0.06 0.07
##  [16] 0.29 0.20 0.06 0.10 0.37 0.09 0.12 0.09 0.41 0.17 0.24 0.07 0.11 0.30 0.17
##  [31] 0.09 0.42 0.06 0.08 0.43 0.26 0.11 0.47 0.09 0.08 0.13 0.27 0.45 0.12 0.11
##  [46] 0.12 0.16 0.06 0.09 1.00 0.10 0.08 1.00 0.07 0.34 0.08 0.11 0.07 0.14 0.08
##  [61] 0.07 0.09 0.28 0.06 0.23 0.13 0.06 0.06 0.67 0.24 0.06 0.37 0.07 0.06 0.10
##  [76] 0.19 0.06 1.00 0.13 0.16 0.16 1.00 0.32 0.23 0.11 0.49 0.16 0.37 1.00 0.07
##  [91] 0.93 0.11 0.16 0.20 1.00 0.10 0.15 0.11 1.00 0.07 0.07 0.22 0.21 0.10 0.22
## [106] 0.06 0.06 0.08 0.10 0.06 0.10 0.06 0.06 0.09 0.07 0.06 0.06 0.08 0.06 0.07
## [121] 0.55 0.74 0.06 0.11 0.18 0.10 0.11 1.00 0.09 0.23 0.06 0.11 0.18 0.36 0.37
## [136] 0.11 1.00 0.07 0.06 0.09 0.47 0.09 0.06 0.59 0.11 0.08 0.78 0.09 0.14 0.61
## [151] 0.09 0.06 0.11 0.28 0.06 0.12 0.06 0.27 0.15 0.49 0.06 1.00 0.07 0.39 0.09
## [166] 0.22 0.72 0.07 0.18 0.06 0.41 0.06 0.38 0.08 0.20 0.55 0.29 0.10 0.12 1.00
## [181] 0.18 0.12 0.06 0.08 0.06 0.27 0.06 0.39 0.08 0.47 0.09 0.09 0.06 0.10 0.13
## [196] 0.07 0.43 0.07 0.08 0.62 0.29 0.06 0.09 0.06 0.07 0.09 0.07 0.20 0.08 0.07
## [211] 0.06 0.13 0.35 0.06 0.07 0.07 0.17 0.12 0.06 0.36 0.07 0.13 0.08 0.09 0.06
## [226] 0.08 0.08 0.12 0.34 0.34 0.97 0.09 0.11 0.22 0.14 0.13 0.06 0.45 0.15 0.16
## [241] 0.72 0.19 0.11 0.77 0.47 0.13 0.11

## [1] "----PctIlleg"
##   [1] 0.73 0.69 0.67 0.97 0.85 0.77 0.81 0.97 0.77 0.90 0.74 0.97 0.87 0.73 1.00
##  [16] 0.75 0.91 0.70 1.00 0.91 1.00 0.73 0.72 0.73 0.98 0.79 0.91 1.00 0.74 1.00
##  [31] 0.78 1.00 0.84 0.74 1.00 1.00 0.67 1.00 0.76 0.86 1.00 0.68 0.85 0.75 1.00
##  [46] 1.00 1.00 1.00 1.00 0.91 0.68 0.69 0.67 0.94 0.86 0.99 0.74 1.00 0.85 0.68
##  [61] 0.78 1.00 0.70 0.69 0.69 0.80 0.75 1.00 0.83 0.72 1.00 0.96 0.79 0.67 0.77
##  [76] 1.00 0.69 0.99 1.00 1.00 0.75 0.70 0.84 0.76 0.88 0.79 0.78 1.00 1.00 0.73
##  [91] 1.00 0.87 0.68 1.00 0.96 1.00 0.84 0.72 0.81 1.00 1.00 0.84 1.00 0.69 0.81
## [106] 0.74 1.00 0.87 0.87 0.70 0.72 1.00 0.75 0.94 1.00 0.75 0.72 0.84 0.94 1.00
## [121] 0.83 0.73 0.91 0.78 0.76 0.69 0.74 0.85 0.97 0.79 1.00 0.68 0.72 0.80 0.72
## [136] 0.67 0.78 0.70 0.74 1.00 1.00 0.69 1.00 1.00 0.81 1.00 1.00 0.74 0.85 0.83
## [151] 1.00 0.71 0.78 0.82 0.78 0.86 0.85

## [1] "----NumImmig"
##   [1] 0.28 0.14 0.16 0.06 0.10 0.23 0.11 0.07 0.08 0.25 0.14 0.29 0.14 0.21 0.15
##  [16] 0.07 0.09 0.11 0.14 0.08 0.08 0.38 0.06 0.20 0.09 0.07 0.06 0.34 0.08 0.10
##  [31] 0.06 0.70 0.07 0.06 0.11 0.45 0.27 0.07 0.10 0.19 0.20 0.16 0.13 0.34 0.58
##  [46] 0.08 0.21 0.27 0.07 0.17 0.07 0.07 0.16 0.14 0.12 0.06 0.21 0.15 0.13 0.08
##  [61] 1.00 0.06 0.09 0.09 0.07 1.00 0.07 0.06 0.07 0.07 0.14 0.06 0.09 0.07 0.27
##  [76] 0.09 0.13 0.77 0.06 0.70 0.22 0.06 0.13 0.84 0.09 0.15 0.16 0.16 0.06 0.12
##  [91] 0.06 0.26 0.14 0.06 0.06 0.09 0.10 0.11 0.07 0.19 0.13 0.06 0.08 0.16 0.08
## [106] 0.06 0.16 0.08 0.07 0.07 0.07 0.08 0.14 0.21 0.12 0.18 0.08 0.18 0.81 0.07
## [121] 0.39 0.07 0.12 0.14 0.20 1.00 0.07 0.09 0.08 0.59 0.23 0.10 0.16 1.00 0.06
## [136] 0.08 0.14 0.49 0.13 0.28 1.00 0.10 0.22 0.06 0.09 0.11 0.07 0.10 0.09 0.14
## [151] 0.14 0.41 0.15 0.07 0.57 0.06 0.18 0.11 0.17 0.32 0.13 0.10 0.06 0.06 0.11
## [166] 0.07 0.06 0.32 0.10 0.10 0.13 0.08 0.09 0.30 0.06 0.15 0.15 0.09 0.07 0.07
## [181] 0.06 0.29 0.18 0.15 0.16 0.08 0.09 0.09 0.12 0.08 0.23 0.08 0.09 0.10 0.14
## [196] 0.20 0.07 0.13 0.13 0.17 0.06 0.20 0.11 0.08 0.14 0.15 0.14 0.54 0.06 0.09
## [211] 0.12 0.06 0.12 0.07 0.18 0.24 0.08 0.16 0.24 0.13 0.34 0.08 0.07 0.10 0.06
## [226] 0.07 0.06 0.28 0.06 0.13 0.07 0.09 0.06 0.07 0.15 0.08 1.00 0.09 0.10 0.09
## [241] 0.11 1.00 0.06 0.06 0.20

## [1] "----PctImmigRecent"
##  [1] 1.00 1.00 0.93 1.00 0.94 1.00 1.00 0.87 1.00 1.00 1.00 0.96 1.00 1.00 0.91
## [16] 0.87 1.00 1.00 0.85 0.94 0.90 0.88 0.98 0.95 1.00 0.90 1.00 1.00 0.84 1.00
## [31] 1.00 0.93 0.98 0.95 1.00 0.84 1.00 1.00 1.00 1.00 0.96 0.95 0.90 1.00 1.00
## [46] 1.00 1.00 0.94 1.00 1.00 1.00 1.00 1.00 0.88 0.99 0.87 1.00 1.00 0.96 1.00
## [61] 0.89 1.00 0.94 0.92 1.00 1.00 0.89 1.00 1.00 0.96 0.93

## [1] "----PctImmigRec5"
##  [1] 0.91 0.90 1.00 0.94 1.00 0.97 0.95 1.00 0.98 0.94 0.91 1.00 1.00 0.93 1.00
## [16] 0.93 0.90 1.00 0.94 1.00 1.00 1.00 0.90 1.00 0.92 0.90 1.00 0.97 0.96 1.00
## [31] 0.97 1.00 0.95 1.00 0.90 0.98 1.00 1.00 1.00 1.00 1.00 0.91 0.99 0.91 1.00
## [46] 1.00 0.93 1.00 1.00 0.94

## [1] "----ViolentCrimesPerPop"
##   [1] 0.84 0.80 0.75 1.00 0.86 1.00 1.00 0.86 0.87 0.73 0.73 0.83 0.74 0.88 1.00
##  [16] 0.74 0.93 1.00 1.00 0.83 0.80 1.00 1.00 1.00 1.00 0.76 1.00 1.00 1.00 0.78
##  [31] 1.00 0.91 0.86 0.85 1.00 0.87 0.87 0.82 0.89 0.85 0.94 1.00 0.81 0.95 0.81
##  [46] 1.00 0.82 0.90 1.00 1.00 1.00 0.79 1.00 0.97 0.85 0.76 1.00 0.74 0.73 1.00
##  [61] 1.00 0.77 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.82 1.00 0.76
##  [76] 0.84 0.97 1.00 0.96 0.90 1.00 0.83 0.82 1.00 0.86 0.81 0.76 0.93 0.75 0.78
##  [91] 1.00 0.87 0.91 0.81 0.80 0.90 0.79 0.75 1.00 1.00 1.00 0.75 0.83 0.81 0.86
## [106] 1.00 0.95 0.88 1.00 0.75

Outlier Handling

# Columns whose outliers will be capped at the Tukey fences
columns_to_process <- c(
  "PctEmplProfServ", "PctOccupManu", "PctOccupMgmtProf", "MalePctDivorce",
  "MalePctNevMarr", "PctFam2Par", "PersPerFam", "PctKids2Par",
  "PctYoungKids2Par", "PctTeen2Par", "PctWorkMomYoungKids", "PctWorkMom",
  "NumIlleg", "PctIlleg", "PctImmigRecent", "ViolentCrimesPerPop"
)

# Keep an untouched copy of the raw data.
# `raw` is a data.table, so plain `<-` assigns by reference; an explicit
# copy() guarantees raw_no_outliers is not modified alongside raw.
raw_no_outliers <- copy(raw)

# Cap outliers using the IQR (Tukey fence) method.
# Both fences are computed from the ORIGINAL column values before any
# replacement: the original code recomputed the lower fence after the
# upper outliers had already been capped, which shifted the quartiles
# and produced a wrong lower fence.
for (col_name in columns_to_process) {
  x <- raw[[col_name]]                        # snapshot of the original values
  OutVals <- boxplot(x, plot = FALSE)$out     # stats only, no chart drawn
  col_median <- median(x)                     # avoid shadowing stats::median

  # Split outliers into those above and below the median
  upper_out <- OutVals[OutVals > col_median]
  lower_out <- OutVals[OutVals < col_median]

  # Tukey fences from the unmodified column
  upper_fence <- quantile(x, 0.75) + 1.5 * IQR(x)
  lower_fence <- quantile(x, 0.25) - 1.5 * IQR(x)

  raw[which(x %in% upper_out), col_name] <- upper_fence
  raw[which(x %in% lower_out), col_name] <- lower_fence
}

# Stack the capped data on top of the untouched copy (doubles the rows)
data <- rbind(raw, raw_no_outliers)

Model Training

# Specify target and feature variables
target <- "ViolentCrimesPerPop"
features <- setdiff(names(data), target)

# Build the model formula: target ~ feature1 + feature2 + ...
f <- as.formula(paste(target, paste(features, collapse = " + "), sep = " ~ "))

# Fit a linear regression (gaussian glm) using base R.
# NOTE(review): the name `glm` shadows stats::glm for the rest of the
# script; later `glm(...)` calls still resolve because R skips
# non-function objects during function lookup, but a distinct name
# would be clearer. Kept as-is because the VIF loop below reads it.
glm <- glm(f, data = data)

# Show coefficient estimates and significance
summary(glm)
## 
## Call:
## glm(formula = f, data = data)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.41675    0.04160  10.019  < 2e-16 ***
## PctEmplProfServ     -0.03087    0.01968  -1.569 0.116831    
## PctOccupManu        -0.13375    0.02018  -6.628 3.87e-11 ***
## PctOccupMgmtProf     0.01350    0.02563   0.527 0.598433    
## MalePctDivorce       0.75811    0.15309   4.952 7.65e-07 ***
## MalePctNevMarr      -0.07459    0.02004  -3.722 0.000200 ***
## FemalePctDiv         0.78018    0.19084   4.088 4.44e-05 ***
## TotalPctDiv         -1.31599    0.32603  -4.036 5.53e-05 ***
## PersPerFam           0.11925    0.02215   5.383 7.74e-08 ***
## PctFam2Par           0.31625    0.08795   3.596 0.000328 ***
## PctKids2Par         -0.84341    0.07381 -11.427  < 2e-16 ***
## PctYoungKids2Par     0.06529    0.03071   2.126 0.033545 *  
## PctTeen2Par          0.01765    0.02899   0.609 0.542695    
## PctWorkMomYoungKids  0.09316    0.03037   3.067 0.002175 ** 
## PctWorkMom          -0.16854    0.03017  -5.586 2.48e-08 ***
## NumIlleg             0.14816    0.03443   4.304 1.72e-05 ***
## PctIlleg             0.32116    0.02675  12.007  < 2e-16 ***
## NumImmig             0.16794    0.02980   5.635 1.87e-08 ***
## PctImmigRecent       0.02044    0.02303   0.888 0.374838    
## PctImmigRec5        -0.01042    0.02366  -0.440 0.659700    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.01741977)
## 
##     Null deviance: 192.668  on 3987  degrees of freedom
## Residual deviance:  69.122  on 3968  degrees of freedom
## AIC: -4812.6
## 
## Number of Fisher Scoring iterations: 2
# Check and handle multicollinearity using VIF
library(faraway)
## 
## Attaching package: 'faraway'
## The following object is masked from 'package:lattice':
## 
##     melanoma
## The following object is masked from 'package:mice':
## 
##     mammalsleep
# Iteratively drop the predictor with the highest VIF and refit until
# every remaining predictor's VIF is below 1.5.
# NOTE(review): 1.5 is a very strict threshold — the conventional
# multicollinearity cutoffs are 5 or 10; confirm this is intended.
# vif() is computed once per iteration (the original computed it twice).
repeat {
  vifs <- glm %>% faraway::vif() %>% sort(decreasing = TRUE)
  if (vifs[1] < 1.5) break
  afterVIF <- names(vifs)[-1]   # drop the single worst predictor
  f <- as.formula(paste(target, paste(afterVIF, collapse = " + "), sep = " ~ "))
  glm <- glm(f, data = data)
}

H2O.ai Model Training

# Initialize (or connect to) a local H2O cluster on localhost:54321;
# all subsequent as.h2o()/h2o.glm() calls run against this cluster
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         17 minutes 10 seconds 
##     H2O cluster timezone:       Asia/Baku 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.42.0.2 
##     H2O cluster version age:    5 months and 3 days 
##     H2O cluster name:           H2O_started_from_R_ACER_osn291 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.93 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.1 (2023-06-16 ucrt)
## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (5 months and 3 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html
# Upload the combined data frame to the H2O cluster as an H2OFrame
h2o_data <- as.h2o(data)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Split the frame into training (~70%) and testing (~30%) sets;
# the seed makes the split reproducible
h2o_data <- h2o.splitFrame(h2o_data, seed = 123, ratios = 0.7)
train <- h2o_data[[1]]
test <- h2o_data[[2]]

# Fit a generalized linear model with H2O.
# lambda = 0 disables regularization, which is required for
# compute_p_values = TRUE; 10-fold CV estimates generalization error.
model <- h2o.glm(
  x = features,
  y = target,
  training_frame = train,
  validation_frame = test,
  nfolds = 10,
  seed = 123,
  lambda = 0,
  compute_p_values = TRUE
)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |=================================================================     |  93%
  |                                                                            
  |======================================================================| 100%
# Coefficients with rounded p-values, intercept row dropped,
# least significant features listed first
coef_table <- as.data.frame(model@model$coefficients_table)
coef_table %>%
  dplyr::select(names, p_value) %>%
  mutate(p_value = round(p_value, 3)) %>%
  slice(-1) %>%
  arrange(desc(p_value))
##                  names p_value
## 1         PctImmigRec5   0.686
## 2     PctYoungKids2Par   0.345
## 3      PctEmplProfServ   0.239
## 4     PctOccupMgmtProf   0.236
## 5          PctTeen2Par   0.224
## 6  PctWorkMomYoungKids   0.221
## 7       PctImmigRecent   0.143
## 8           PctWorkMom   0.002
## 9         FemalePctDiv   0.001
## 10         TotalPctDiv   0.001
## 11          PctFam2Par   0.001
## 12        PctOccupManu   0.000
## 13      MalePctDivorce   0.000
## 14      MalePctNevMarr   0.000
## 15          PersPerFam   0.000
## 16         PctKids2Par   0.000
## 17            NumIlleg   0.000
## 18            PctIlleg   0.000
## 19            NumImmig   0.000
# Backward elimination: repeatedly drop the feature with the highest
# p-value and refit until every remaining feature is significant at 0.05.

# Helper: coefficient table sorted by descending p-value, with the
# intercept row and NaN p-values removed. The original duplicated this
# pipeline and, in the loop condition, forgot to drop the intercept and
# NaN rows — the condition is now consistent with the selection step.
ranked_pvalues <- function(mdl) {
  mdl@model$coefficients_table %>%
    as.data.frame() %>%
    dplyr::select(names, p_value) %>%
    mutate(p_value = round(p_value, 3)) %>%
    filter(!is.nan(p_value)) %>%
    .[-1, ] %>%
    arrange(desc(p_value))
}

while (ranked_pvalues(model)[1, 2] > 0.05) {
  v <- ranked_pvalues(model)[1, 1]   # worst (highest p-value) feature
  features <- features[features != v]
  # all_of() replaces the bare external-vector selection deprecated in
  # tidyselect 1.1.0 (the source of the warnings this chunk emitted)
  train <- train %>% as.data.frame() %>%
    select(all_of(target), all_of(features)) %>% as.h2o()
  test <- test %>% as.data.frame() %>%
    select(all_of(target), all_of(features)) %>% as.h2o()
  model <- h2o.glm(
    x = features, y = target,
    training_frame = train,
    validation_frame = test,
    nfolds = 10, seed = 123,
    lambda = 0, compute_p_values = TRUE
  )
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(target)
## 
##   # Now:
##   data %>% select(all_of(target))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(features)
## 
##   # Now:
##   data %>% select(all_of(features))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Final model: every surviving feature with its rounded p-value
final_coefs <- as.data.frame(model@model$coefficients_table)
final_coefs %>%
  dplyr::select(names, p_value) %>%
  mutate(p_value = round(p_value, 3))
##             names p_value
## 1       Intercept   0.000
## 2    PctOccupManu   0.000
## 3  MalePctDivorce   0.000
## 4  MalePctNevMarr   0.000
## 5    FemalePctDiv   0.001
## 6     TotalPctDiv   0.001
## 7      PersPerFam   0.000
## 8      PctFam2Par   0.000
## 9     PctKids2Par   0.000
## 10     PctWorkMom   0.000
## 11       NumIlleg   0.001
## 12       PctIlleg   0.000
## 13       NumImmig   0.000
## 14 PctImmigRecent   0.023

Model Evaluation

# Score the held-out test frame with the final model and pull the
# predictions back from the cluster into an ordinary data frame
y_pred <- h2o.predict(model, newdata = test) %>% as.data.frame()
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Extract the predicted ViolentCrimesPerPop values (column `predict`
# of the frame returned by h2o.predict)
y_pred$predict
##    [1]  0.3276683542  0.2357919834  0.0534537913  0.3494565171  0.2311271800
##    [6]  0.1663099875  0.4968704404  0.1105342632  0.3431871104  0.0699129828
##   [11]  0.2543534999  0.1522864310  0.2265534135  0.1568405943  0.2370203774
##   [16]  0.1717416861  0.3093523825  0.0943840172  0.0380896333  0.3137111595
##   [21]  0.3735478821  0.4210084681  0.1790765208  0.2727645779  0.0113056871
##   [26]  0.1020025277  0.0941117165  0.2059295812  0.0341031059  0.0714439365
##   [31]  0.1246059133  0.0687818392  0.0020445651  0.5547666139  0.0369303978
##   [36]  0.1412200806  0.2981599461  0.0781168390  0.0944174435  0.3141790815
##   [41]  0.0614555998  0.5755999923  0.0761410282  0.0963628881  0.0638141011
##   [46]  0.0551363678  0.5520727653  0.1046855850  0.1466760871  0.0837259969
##   [51]  0.2369082059  0.0692056995  0.2336953908  0.0299392204  0.2254516582
##   [56]  0.1381375736  0.3478849005  0.2876897546  0.1311932085  0.3368332979
##   [61]  0.0547026835  0.2675594967  0.1728135743  0.3023737963  0.3090218634
##   [66]  0.0539553052  0.3892380906  0.4583553896  0.0570486367  0.1400643142
##   [71]  0.0229954964  0.0470845004  0.2202217102  0.4114349995  0.1673957317
##   [76]  0.2747396653  0.2180876111  0.2604723952  0.2532287086  0.5549083013
##   [81]  0.3705625842  0.0388835193  0.1356754448  0.2137891553  0.0079981088
##   [86]  0.0858347013  0.3153278757  0.1462209026  0.1354191597  0.3351150322
##   [91]  0.4704790036  0.0650049035  0.3551666480  0.1315347526  0.3969806842
##   [96]  0.0977206838  0.3876120435  0.3359113267  0.2103092377  0.1794680227
##  [101]  0.2924862601  0.3482652429  0.3473434877  0.2996989158  0.1324205085
##  [106]  0.1901589285  0.0419287557  0.0148407050  0.0883544721  0.1385576219
##  [111]  0.0913877109  0.1665406677  0.4011130941  0.2224591260  0.0513132963
##  [116]  0.0675738477  0.0813112195  0.0304761396  0.0640742400  0.1543668840
##  [121]  0.2884739374  0.1300724883  0.0657865549  0.3110713403  0.5133754027
##  [126]  0.1455118664  0.4875738379  0.2735926883  0.0584550837  0.1262153131
##  [131]  0.3516521412  0.4850977849  0.4521758390  0.3442535514  0.6757147397
##  [136]  0.0444127106  0.0677609280  0.3294417066  0.5120019660  0.1433780575
##  [141]  0.1044507622  0.2103058240  0.0977651488  0.1001903777  0.5570822409
##  [146]  0.1697696525  0.0747019851  0.4752714882  0.6399772549  0.1530577168
##  [151]  0.3253144310  0.2497426563  0.0296503574  0.3988087778  0.3069890281
##  [156]  0.3159295879  0.1749361760  0.0524786024  0.3099293263  0.0122147599
##  [161]  0.5880439710  0.0681995221  0.4072589825  0.2319517954  0.4863278870
##  [166]  0.3392150902  0.2595194494  0.5889684794  0.2200985943  0.3447206292
##  [171]  0.1150339769  0.0777016125  0.1241559956  0.2600617556  0.2200678684
##  [176]  0.2158727729  0.0814623902  0.1075625724  0.1203224189  0.2348388199
##  [181]  0.0651978778  0.0789719258  0.1719476584  0.0188340895  0.3815307602
##  [186]  0.2503108053  0.4513658132  0.5421933530  0.3308912793  0.1522005217
##  [191]  0.0227095544  0.1834173575  0.2466782480  0.3365900681  0.2133197269
##  [196]  0.5204893140  0.1099211078  0.0842297078  0.0379865908  0.0957923767
##  [201]  0.2544125646  0.1264668125  0.3203180269  0.0294087084 -0.0028648480
##  [206]  0.0185609189  0.0324773834  0.1587637375  0.0947755550  0.3390141137
##  [211]  0.2703703891  0.2965136942  0.6471980832  0.4202904297  0.4578554628
##  [216]  0.4927880290  0.3041244422  0.1821949766  0.4293715308  0.0186976418
##  [221]  0.2572416822  0.0587745040  0.2977843403  0.0825300133  0.0194179674
##  [226]  0.2463686388  0.4774115493  0.5826297422  0.1500879937  0.1224696925
##  [231]  0.2192468046  0.4714911006  0.2181169294  0.0523225690  0.1612151807
##  [236]  0.0924366499  0.0554773026  0.0944655453  0.3041343563  0.1046919552
##  [241]  0.1936317844  0.3052829976  0.3644097647  0.0144145152  0.2078642557
##  [246]  0.2334014141  0.1308524708  0.5075264509  0.5585203882  0.1063400872
##  [251]  0.0182096255  0.1343598988  0.3763869504  0.1858552064  0.3051480184
##  [256]  0.2499407076  0.1764422378  0.1341358432  0.4367564428  0.0973136152
##  [261]  0.1845645683 -0.0197551864  0.1977286220  0.2277634648  0.4033131201
##  [266]  0.2608377433  0.0891652255  0.1987313813  0.2843607788  0.4086302008
##  [271]  0.0557489774  0.0844502459  0.1070084184  0.1949653745  0.0479754894
##  [276]  0.1960007026  0.0544028836  0.1613061753  0.2653254548  0.3850871659
##  [281]  0.0630404883  0.1327992291  0.0312025472  0.5634427937  0.1121044594
##  [286]  0.2475003974  0.2878162049  0.2210966498  0.0247305616  0.4179597215
##  [291]  0.1785562217  0.0557874060  0.2339030978  0.1231292890  0.0676302165
##  [296]  0.3181896926  0.4377270478  0.0232980541  0.0316605627  0.2356710598
##  [301]  0.6447967640  0.1469607892  0.1864727321  0.0420495325  0.1217105303
##  [306]  0.4198279109  0.1930846304  0.6453356642  0.4453948596  0.2272534676
##  [311]  0.2555551299  0.0903560194  0.0646704646  0.5053467312  0.2592608600
##  [316]  0.4020183175  0.3000640275  0.7458402928  0.0897619464  0.2474094577
##  [321]  0.5427007017  0.2876150711  0.2143168365 -0.0041707412  0.1844005276
##  [326]  0.0926341460  0.2566742750  0.3969003253  0.4016559791  0.0148170950
##  [331]  0.2363206135  0.1488048674  0.0553313377  0.0577220356  0.0920355135
##  [336]  0.2296516956  0.5211923677  0.3042392236  0.2366508021  0.1491577637
##  [341]  0.1605046982  0.0522991784  0.0420106974  0.1269672618  0.2682411782
##  [346]  0.3979552415  0.0990870127  0.2008606764  0.0033276494  0.5398004211
##  [351]  0.3077190620  0.0830648248  0.1674135047  0.4433401803  0.2246707448
##  [356]  0.0774024180  0.4013785429  0.2634401112 -0.0007724021  0.2401818061
##  [361]  0.2700382090  0.0098386143  0.1370769294  0.0614108253  0.0780512212
##  [366] -0.0058465042  0.0144134348  0.0993921307  0.1934008487  0.2616313157
##  [371]  0.0823037484  0.2758802551  0.1027501223  0.5152014743  0.2377389797
##  [376]  0.1631006004  0.2980801329  0.0333643276  0.0245769741  0.2781861285
##  [381]  0.3618580913  0.1774946164  0.2259599059  0.5303274154  0.2603019970
##  [386]  0.4575556231  0.3687723842  0.3198960690  0.1556149093  0.1137579109
##  [391]  0.0715189097  0.0530945821  0.0628233407  0.1889443591  0.0313787087
##  [396]  0.0700015994  0.0373000242  0.1703089569  0.1324141402  0.0584845623
##  [401]  0.3722915917  0.1324596395  0.2784955876  0.2303506567  0.2398224397
##  [406]  0.0143304831  0.1193675138  0.0613240491  0.5342946235  0.2141984540
##  [411]  0.3813439682 -0.0389484575  0.6287908017  0.1061597046  0.0237036033
##  [416]  0.4346885389  0.0073361885  0.1143876266  0.1921423827  0.3083370069
##  [421]  0.3157155966  0.2879546192  0.0322758963  0.1378929663  0.2566997238
##  [426]  0.5031482378  0.5451388562  0.0131674517  0.0208843706  0.3201456647
##  [431]  0.1918028429  0.4081394788  0.3321069097  0.2405460187  0.1244057660
##  [436]  0.0630750414  0.2940231596  0.1415497141  0.1242023077  0.2773677955
##  [441]  0.5129845646  0.5042148347  0.0198417849  0.2038218447  0.4046668379
##  [446]  0.2321575159  0.1705484457  0.4346071829  0.4126313425  0.5207212974
##  [451]  0.1290190690  0.1419834013  0.3464499890  0.0918739549  0.1801397306
##  [456]  0.1050806504  0.1037864283  0.4827270573  0.0826790551  0.3212171067
##  [461]  0.0875871935 -0.0169912507  0.0796132366  0.3223482610  0.0844822971
##  [466]  0.0683803610  0.5676598277  0.2078135813  0.2353955276  0.0638769902
##  [471]  0.4434760203  0.1912741982  0.0034685054  0.4871913718  0.2807007551
##  [476]  0.1904795260  0.5197884787  0.0481477635  0.3363011675  0.1083048867
##  [481]  0.5658682796  0.3912134694  0.2257445792  0.0048804348  0.1195444332
##  [486]  0.0863810981  0.5991302995  0.4608903990  0.2264640960  0.1330355944
##  [491]  0.5416833080  0.5514325010  0.0554357750  0.3709822074  0.3735295898
##  [496]  0.0482784827  0.2987401973  0.1399053044  0.4672553308  0.0080654682
##  [501]  0.4782980022  0.1436380920  0.2323510088  0.4212486632  0.0825132000
##  [506]  0.3243257872  0.4293463247  0.1215823064  0.1603904834  0.0662628972
##  [511]  0.0566003361  0.3762516787  0.1464655639  0.0732313655  0.2598614151
##  [516]  0.5795170438  0.2136955152  0.3914829509  0.1741895138  0.5124184002
##  [521]  0.1239881661  0.1485055115  0.0412428732  0.4725326705  0.2224613131
##  [526]  0.1019409261  0.3390401759  0.5540213273  0.3055388577  0.2769173656
##  [531]  0.0405204643  0.2865107108  0.2132127886  0.1281593244  0.0363793699
##  [536]  0.1003308967  0.6724537800  0.4035266707  0.2542470477  0.4788257740
##  [541]  0.2029103524  0.6832358724  0.0790981055  0.1813502617  0.2376983393
##  [546]  0.0849069008  0.2231191857  0.3144856394  0.1650242244  0.2992959201
##  [551]  0.1323941221  0.1798158435  0.2359940543  0.0658562688  0.0867287315
##  [556]  0.1873501292  0.3487606603  0.0994161787  0.2112989519  0.1710237542
##  [561]  0.0978060347  0.3027364421  0.1299602054  0.5774249267  0.2227057837
##  [566]  0.2462094026  0.4642187439  0.2950640445  0.1796621541  0.3843816428
##  [571]  0.0626102163  0.0656780350  0.0919567097  0.0290284649  0.2374671960
##  [576]  0.2155007815  0.0934027529  0.1986447380  0.5426562451  0.3225957696
##  [581]  0.2614802838  0.1785278402  0.1798598739  0.3496668304  0.0388582929
##  [586]  0.0417841843  0.1230392633  0.4014595142  0.0426371837  0.4019187729
##  [591]  0.2311271800  0.1663099875  0.5155289271  0.0915050465  0.4240997755
##  [596]  0.0699129828  0.1373525919  0.2727090330  0.1522864310  0.0676447542
##  [601]  0.2237493900  0.3152354464  0.3137111595  0.1790765208  0.0454784599
##  [606]  0.1526004668  0.3674611780  0.2393688577  0.0113056871  0.1221146459
##  [611]  0.0955806912  0.1309411307  0.2494746448  0.0320134209  0.2291831699
##  [616]  0.2919884331  0.5245990330  0.1045488578  0.2196790544  0.0020445651
##  [621]  0.2696640269  0.2981599461  0.0781168390  0.0879482646  0.3261985674
##  [626]  0.1631398959  0.0761410282  0.2993509777  0.2182627918  0.5900770685
##  [631]  0.4012131951  0.6431760922  0.0709717584  0.3397909515  0.1046855850
##  [636]  0.1381535636  0.0837259969  0.0718952331  0.3987253088  0.2336953908
##  [641]  0.2927413717  0.0939959898  0.3594004123  0.2254516582  0.0229006061
##  [646]  0.2675594967  0.1728135743  0.4087116802  0.2371036213  0.3294549896
##  [651]  0.0530993975  0.2128387416  0.1400643142  0.0470845004  0.0796131014
##  [656]  0.0843203687  0.0380190442  0.3323814378  0.4068764790  0.2604723952
##  [661]  0.1419926016  0.4121031913  0.0199957961  0.2532287086  0.5708178820
##  [666]  0.2017649189  0.1356754448  0.2137891553  0.3153278757  0.3978166530
##  [671]  0.1005276326  0.0540290959  0.2702925651 -0.0149592889  0.3745533110
##  [676]  0.0854935589  0.0222020494  0.2206530778  0.1457124859  0.2544434743
##  [681]  0.3192285048  0.3232566515  0.0391178207  0.1328095065  0.1817039516
##  [686]  0.3997006235  0.1198546749  0.3515366585  0.1405594762  0.3226232675
##  [691]  0.1173185439  0.2996989158  0.0419287557  0.8201829294  0.0844963802
##  [696]  0.0404449217  0.2410453271  0.1385576219  0.0151777550  0.0362605688
##  [701]  0.1559252436  0.2290648691  0.0640742400  0.6975932982  0.3110713403
##  [706]  0.5417379232  0.1917444523  0.2682852481  0.2678136175  0.0807150683
##  [711]  0.4762566756  0.3115231455  0.2500932847  0.3111373089  0.4947133736
##  [716]  0.2141609809  0.3915933466  0.9466201536  0.5280201306  0.0350040023
##  [721]  0.4786463045  0.0444127106  0.1433780575  0.1044507622  0.2103058240
##  [726]  0.0514541419  0.7529642073  0.3311974950  0.5820787648  0.3387319322
##  [731]  0.3069890281  0.1817459568  0.1411888082  0.4344983315  0.1749361760
##  [736]  0.2825424331  0.0143664032  0.0387139175  0.4191322254  0.3099293263
##  [741]  0.0122147599  0.1184927901  0.1872122037  0.3251889191  0.3480610003
##  [746]  0.0919776528  0.5775153780  0.1112072613  0.4737946628  0.2200985943
##  [751]  0.3447206292  0.5302459822  0.4641708952  0.2919630391  0.1511507103
##  [756]  0.1241559956  0.1218278853  0.2600617556  0.2208417397  0.0971961808
##  [761]  0.0417653957  0.1075625724  0.6563288637  0.3376873255  0.1239416432
##  [766]  0.1665500950  0.0453673577  0.2546715084  0.3815307602  0.3841530940
##  [771]  0.5421933530  0.1522005217  0.3345982528  0.3365900681  0.2882648456
##  [776]  0.8213235814  0.7881302226  0.2545480448  0.0957923767  0.4238654678
##  [781]  0.1260113504  0.1264668125  0.1096841534  0.0294087084  0.3822485061
##  [786]  0.0828366863  0.1555816237  0.0324773834  0.2043093203  0.3551925395
##  [791]  0.5113723599  0.2965136942  0.1786926106  0.8370494134  0.2037242657
##  [796]  0.0796954009  0.7885382076  0.4817127334  0.1912323214  0.3186047471
##  [801]  0.1730460698  0.2657430621  0.1657608494  0.1559138488  0.7417171563
##  [806]  0.7064872703  0.1224696925  0.2192468046  0.6511514153  0.0322655135
##  [811]  0.1227532759  0.1612151807  0.3009524507  0.0880247122  0.1046919552
##  [816]  0.1453489316  0.4074345782  0.3644097647  0.4682137153  0.1861190639
##  [821]  0.2334014141  0.8265211373  0.3905325686  0.1870064806  0.0395631953
##  [826] -0.0059291212  0.1324835687  0.2733459155  0.0662892754  0.1567532194
##  [831]  0.3195232577  0.3276688149  0.1824060954  0.6859853581  0.3051480184
##  [836]  0.2516754465  0.1018390610  0.1341358432  0.0953975928  0.6125567477
##  [841]  0.0973136152  0.5438283775  0.1353562622  0.0515667770  0.0593339287
##  [846]  0.1300424202  0.1818416043  0.2998613357  0.0323651479  0.1987313813
##  [851]  0.2535170721  0.0899245227  0.4783473373  0.4566790039  0.4086302008
##  [856]  0.0557489774  0.1449744894  0.1566643255  0.0798451640  0.3654225454
##  [861]  0.1949653745  0.0479754894  0.0090304145  0.1613061753  0.1455932183
##  [866]  0.3850871659  0.1327992291  0.0681990396  0.3679246665  0.5774786688
##  [871]  0.0312025472  0.6638401556  0.1121044594  0.2657715658  0.2475003974
##  [876]  0.2878162049  0.4179597215  0.1951023890  0.0328216603  0.1264353401
##  [881]  0.4610250233  0.0676302165  0.1865246215  0.4799839336  0.0506646077
##  [886]  0.1224290989  0.0849733285  0.0908536746  0.0315805919  0.0420495325
##  [891]  0.1582325867  0.2151444653  0.0831963897  0.1295088510  0.3297232774
##  [896]  0.1330978651  0.2395036527  0.0905795702  0.4646260372  0.2555551299
##  [901]  0.2740781616  0.5464084553  0.0124037050  0.1754829958  0.0235861285
##  [906]  0.0756822182  0.0295486897  0.0897619464  0.2471117637  0.3212570278
##  [911]  0.5759229975  0.3616064797  0.0572640402  0.1124111165  0.2934981351
##  [916]  0.2782775451  0.4315518511  0.1425013949  0.1257227421  0.1844005276
##  [921]  0.2983512747  0.0104702772  0.2797949764  0.1488048674  0.0920355135
##  [926]  0.1545328952  0.2791166462  0.0753886732  0.1810004025  0.0933343056
##  [931]  0.0433184893  0.3042392236  0.0522991784  0.0420106974  0.2303541778
##  [936]  0.2182122081  0.1336507939  0.0990870127  0.2946345411  0.5800524808
##  [941]  0.0797759169  0.5398004211  0.1545460891  0.1177699565  0.2060130012
##  [946]  0.8000745592  0.3730611767  0.0297629595  0.3433650862  0.4433401803
##  [951]  0.2663341689  0.0665190749  0.0615590076  0.0055733783  0.0745237563
##  [956]  0.3087753671  0.5647669107  0.0206777769  0.1575714731  0.4105309903
##  [961]  0.0780512212  0.1060556398  0.4185156737  0.0512461878  0.2742669239
##  [966]  0.1654669473  0.0718753641  0.0197811247  0.3484296951  0.2418402030
##  [971]  0.9334025992  0.2377389797  0.3045897156  0.0319542688  0.3055747452
##  [976]  0.6563036320  0.7050489159  0.2781861285  0.3618580913  0.0789962940
##  [981]  0.1460169625  0.1253071125  0.3518574244  0.3514983893  0.0346459836
##  [986]  0.3052371771  0.3391272465  0.1154535305  0.3729714163  0.4887477022
##  [991]  0.5715008257  0.0715189097  0.0530945821  0.3632962005  0.0975164514
##  [996]  0.1889443591  0.0654536098  0.2387735069  0.0997226848  0.1324141402
## [1001]  0.1491367435  0.2529002743  0.0513740802  0.0143304831  0.2025549750
## [1006]  0.0613240491  0.3196153113  0.1248726681  0.5843439810  0.0650210088
## [1011]  0.0432874638  0.2898759322  0.3996350751  0.6332030997  0.3412460240
## [1016]  0.4346885389  0.1833081411  0.4076900417  0.0821985754  0.1143876266
## [1021]  0.1837287425  0.0548118905  0.3778906464  0.0875356807  0.3157155966
## [1026]  0.4887531777  0.0616432585  0.2330606627  0.2566997238  0.2096893330
## [1031]  0.4459638514  0.1575473653  0.2756395414  0.0468350697  0.0641222234
## [1036]  0.3274994947  0.2711466340  0.0288918054  0.4697714831  0.3321069097
## [1041]  0.8539435070  0.1244057660  0.7911102551  0.2006513024  0.0766123917
## [1046]  0.3821446374  0.5423773029  0.5068893693  0.2038218447  0.0090226102
## [1051]  0.1169037061  0.2495169227  0.1003841189  0.3068825029  0.1219338321
## [1056]  0.4170436404  0.5141983686  0.1146504376  0.0918739549  0.0975314880
## [1061]  0.1050806504  0.2931192079  0.1737944669  0.0486310449  0.0393583907
## [1066]  0.4827270573  0.2775830276  0.2427191022  0.2389811694  0.0796132366
## [1071]  0.3223482610  0.0844822971  0.5988667380  0.2078135813  0.2511815220
## [1076]  0.2798116341  0.1430251561  0.3905951161  0.0569583048  0.3005063363
## [1081]  0.1083048867  0.7013572698  0.1128689524  0.3439401633  0.0048804348
## [1086] -0.0197448583  0.6416627896  0.6759499926  0.1728234563  0.0459195373
## [1091]  0.2264640960  0.0304273523  0.1330355944  0.0686542338  0.1610076327
## [1096]  0.5987206950  0.2197431527  0.3830016934  0.2276727503  0.0876982101
## [1101]  0.0813622411  0.4843527692  0.0367085566  0.1462900941  0.4212486632
## [1106]  0.0039405857  0.3221246590  0.1204908367  0.0631951156 -0.0062551549
## [1111]  0.0657170259  0.5080309805  0.4268972368  0.3502252094  0.2550536207
## [1116]  0.0912200074  0.1741895138  0.0080003925  0.6315964463  0.4059195934
## [1121]  0.4879779448  0.0413627975  0.2432439696  0.4479824960  0.6454804583
## [1126]  0.3055388577  0.0593053925  0.0405204643  0.2429783782  0.0471484724
## [1131]  0.0626286112  0.2144512648  0.1003308967  0.7264279909  0.4461788842
## [1136]  0.3581550137  0.0790981055  0.7455276412  0.2879377608  0.0211275610
## [1141]  0.0569399981  0.6522593673  0.0427567141  0.1392799007  0.2992959201
## [1146]  0.5988967979  0.7538561769  0.0658562688  0.0687731136  0.1710237542
## [1151]  0.1169782483  0.1895821982  0.1681123160  0.3027364421  0.1299602054
## [1156]  0.1176579771  0.1221066804  0.6510377867  0.1281656391  0.2389977110
## [1161]  0.5139111161  0.2198040332  0.3939972316  0.0170089790  0.0614087039
## [1166]  0.2551753959  0.1751490036  0.1396232336  0.3412700870  0.1986447380
## [1171]  0.0249513309  0.6069898225  0.4152720151  0.1785278402  0.0552466720
## [1176]  0.0587964402  0.2506116193  0.0757989306 -0.0076337076  0.0049113171
## [1181]  0.1230392633  0.1420917793
# Materialize the H2O test frame as a base R data frame
test_set <- test %>% as.data.frame()

# Residuals: observed target minus model predictions on the test set
residuals <- test_set$ViolentCrimesPerPop - y_pred$predict

# Root Mean Squared Error (RMSE) on the test set
# (use `<-` for assignment, per R convention)
RMSE <- sqrt(mean(residuals^2))
RMSE
## [1] 0.1287954
# Mean of the observed target in the test set (baseline for TSS)
y_test_mean <- mean(test_set$ViolentCrimesPerPop)

# Total Sum of Squares (TSS) and Residual Sum of Squares (RSS)
tss <- sum((test_set$ViolentCrimesPerPop - y_test_mean)^2)
rss <- sum(residuals^2)

# R-squared: proportion of target variance explained on the test set
R2 <- 1 - (rss / tss)
R2
## [1] 0.6644794
# Sample size and number of predictors, needed for the R2 adjustment
n <- nrow(test_set)   # sample size
k <- length(features) # number of independent variables

# Adjusted R-squared: penalizes R2 for the number of predictors
Adjusted_R2 <- 1 - (1 - R2) * ((n - 1) / (n - k - 1))

# One-row summary of test-set performance.
# Round RMSE to 3 decimals: rounding to 1 decimal (as before) collapsed
# the value 0.1288 to an uninformative 0.1.
tibble(RMSE = round(RMSE, 3),
       R2, Adjusted_R2)
## # A tibble: 1 × 3
##    RMSE    R2 Adjusted_R2
##   <dbl> <dbl>       <dbl>
## 1   0.1 0.664       0.661
# Pair up predicted and observed crime rates for plotting
my_data <- cbind(predicted = y_pred$predict,
                 observed = test_set$ViolentCrimesPerPop) %>% as.data.frame()

# Scatter plot of predicted vs observed values with a fitted regression line
g <- my_data %>%
  ggplot(aes(predicted, observed)) +
  geom_point(color = "red") +
  geom_smooth(method = lm) +
  labs(x = "Predicted Crime rate",
       y = "Observed Crime rate",
       # Fix: round the numeric value directly. The previous
       # round(enexpr(Adjusted_R2), 2) passed the unevaluated symbol
       # captured by rlang::enexpr() to round(), which expects a number.
       # This now matches the correct usage in the train-set plot.
       title = glue('Test: Adjusted R2 = {round(Adjusted_R2, 2)}')) +
  theme(plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
        axis.text.y = element_text(size = 12),
        axis.text.x = element_text(size = 12),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))

# Render the ggplot as an interactive plotly widget
g %>% ggplotly()
## `geom_smooth()` using formula = 'y ~ x'

Train Set Evaluation and Visualization

# Score the training frame with the fitted model and
# bring the predictions back into R as a data frame
y_pred_train <- as.data.frame(h2o.predict(model, newdata = train))
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |======================================================================| 100%
# Materialize the H2O training frame as a base R data frame
train_set <- train %>% as.data.frame()

# Residuals and Root Mean Squared Error (RMSE) on the training set
# (use `<-` for assignment, per R convention)
residuals <- train_set$ViolentCrimesPerPop - y_pred_train$predict
RMSE_train <- sqrt(mean(residuals^2))

# Mean of the observed target in the training set (baseline for TSS)
y_train_mean <- mean(train_set$ViolentCrimesPerPop)

# Total Sum of Squares (TSS) and Residual Sum of Squares (RSS)
tss <- sum((train_set$ViolentCrimesPerPop - y_train_mean)^2)
rss <- sum(residuals^2)

# R-squared (R2) for the training set
R2_train <- 1 - (rss / tss)

# Adjusted R-squared for the training set
n <- nrow(train_set)   # sample size
k <- length(features)  # number of independent variables
Adjusted_R2_train <- 1 - (1 - R2_train) * ((n - 1) / (n - k - 1))

# Assemble predicted and observed training values into a data frame
my_data_train <- as.data.frame(
  cbind(predicted = y_pred_train$predict,
        observed = train_set$ViolentCrimesPerPop)
)

# Predicted-vs-observed scatter for the train set, with a regression line
g_train <- ggplot(my_data_train, aes(predicted, observed)) +
  geom_point(color = "darkred") +
  geom_smooth(method = lm) +
  labs(x = "Predicted Crime rate",
       y = "Observed Crime rate",
       title = glue('Train Set Evaluation: Adjusted R2 = {round(Adjusted_R2_train, 2)}')) +
  theme(plot.title = element_text(color = "darkgreen", size = 16, hjust = 0.5),
        axis.text.x = element_text(size = 12),
        axis.text.y = element_text(size = 12),
        axis.title.x = element_text(size = 14),
        axis.title.y = element_text(size = 14))

# Interactive version of the train-set plot
ggplotly(g_train)
## `geom_smooth()` using formula = 'y ~ x'
# patchwork overloads `+` so two ggplots render side by side
library(patchwork)
print(g_train + g)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Final Summary

# Collect train/test performance metrics into a single summary row.
# Round everything to 3 decimals: the previous round(..., 1) collapsed
# both RMSEs to 0.1, hiding the train/test difference, and left the
# Adjusted R2 columns unrounded (inconsistent precision).
summary_metrics <- tibble(
  RMSE_train = round(RMSE_train, 3),
  RMSE_test = round(RMSE, 3),
  Adjusted_R2_train = round(Adjusted_R2_train, 3),
  Adjusted_R2_test = round(Adjusted_R2, 3)
)

# Print the final summary metrics.
summary_metrics
## # A tibble: 1 × 4
##   RMSE_train RMSE_test Adjusted_R2_train Adjusted_R2_test
##        <dbl>     <dbl>             <dbl>            <dbl>
## 1        0.1       0.1             0.626            0.661